#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import os
import codecs
import csv
import jieba
import nltk
from nltk.corpus import stopwords
from src.constant.file_and_path_constant import FileAndPathConstant

sys.path.append(FileAndPathConstant.System_Drive + '/github-repository/hades/dev-project/jormungandr')

from src.manager.log_manager import LogManager

Logger = LogManager.get_logger(__name__)


class Bayer2Test:
    # Path to the CSV data file (resolved in __init__ from the environment)
    File_Path = None

    # Column name of the text data
    Data_Column = "Tweet"

    # Column name of the label
    Label_Column = "Text Label"

    # Stop-word file path
    Stop_Word_Path = "stopwords.txt"

    # Fraction of the data used for training (rest is testing)
    Training_Data_Percentage = 0.7

    def __init__(self) -> None:
        super().__init__()

        # Pick the data-file location based on the USERDOMAIN environment
        # variable (machine-specific absolute path vs. system-drive default).
        user_do_main = os.environ.get("USERDOMAIN")
        if user_do_main == "LS2DL0YFY3IVX3W":
            Bayer2Test.File_Path = "F:/github-repository/dataset/阿里云-天池-Bully Messages Detection 欺凌消息检测/Bully Messages dataset.csv"
        else:
            Bayer2Test.File_Path = FileAndPathConstant.System_Drive + "/github-repository/dataset/阿里云-天池-Bully Messages Detection 欺凌消息检测/Bully Messages dataset.csv"

    def read_file_and_create_vocabulary_set(self, file_path: str) -> set:
        """
        Read the CSV data file and build the vocabulary set.

        Each row's text column is segmented with jieba; every segment that is
        not a stop word is added to the vocabulary.

        :param file_path: path of the CSV file to read
        :return: set of vocabulary words
        """
        Logger.info("开始从文件【" + file_path + "】中读取数据，并创建词汇表")

        # Load stop words into a set for O(1) membership tests
        # (was a list, giving O(n) per segmented word).
        stop_words = set(self.read_stop_word(Bayer2Test.Stop_Word_Path))

        vocabulary_set = set()
        with codecs.open(file_path, encoding='utf-8') as f:
            for row_ordered_dict in csv.DictReader(f, skipinitialspace=True):
                # paddle mode segmentation, as in the rest of this class
                segment_generator = jieba.cut(row_ordered_dict[Bayer2Test.Data_Column], use_paddle=True)
                for word in segment_generator:
                    if word not in stop_words:
                        vocabulary_set.add(word)
        return vocabulary_set

    def read_stop_word(self, stop_word_path: str) -> list:
        """
        Read the stop-word file, one stop word per line.

        :param stop_word_path: path of the stop-word file
        :return: list of stop words (trailing newlines removed)
        """
        Logger.info("读停用词文件【" + stop_word_path + "】")

        # Iterate the file directly instead of a manual readline() loop.
        with open(stop_word_path, 'r', encoding='utf-8') as f:
            return [raw_line.rstrip("\n") for raw_line in f]

    def create_training_data_set_and_testing_data_set(self, vocabulary_set, file_path: str) -> tuple:
        """
        Build the training/testing feature vectors and labels.

        Each row becomes a binary bag-of-words vector over *vocabulary_set*
        (1 if the vocabulary word occurs in the row's segmented text), and its
        label is the first jieba segment of the label column.

        :param vocabulary_set: vocabulary produced by
            read_file_and_create_vocabulary_set
        :param file_path: path of the CSV file to read
        :return: (training_data_set, training_data_label,
                  testing_data_set, testing_data_label)
        """
        Logger.info("创建训练数据集和标签、测试数据集和标签")

        # Fix a fixed iteration order for the vocabulary and precompute a
        # word -> column-index map so each row is vectorized in O(len(words))
        # instead of O(|vocabulary| * len(words)).
        vocabulary_list = list(vocabulary_set)
        vocabulary_index = {word: index for index, word in enumerate(vocabulary_list)}

        all_data_set = []
        all_data_label = []
        with codecs.open(file_path, encoding='utf-8') as f:
            for row_ordered_dict in csv.DictReader(f, skipinitialspace=True):
                word_list = list(jieba.cut(row_ordered_dict[Bayer2Test.Data_Column], use_paddle=True))
                label_list = list(jieba.cut(row_ordered_dict[Bayer2Test.Label_Column], use_paddle=True))

                # Binary occurrence vector for this row.
                row = [0] * len(vocabulary_list)
                for word in word_list:
                    index = vocabulary_index.get(word)
                    if index is not None:
                        row[index] = 1

                # BUGFIX: the original never appended the row vector (the
                # appends were commented out) and cleared the label list every
                # iteration, so the returned data sets were effectively empty.
                all_data_set.append(row)
                all_data_label.append(label_list[0])

        # Split into training and testing partitions.
        training_data_number = int(len(all_data_set) * Bayer2Test.Training_Data_Percentage)
        training_data_set = all_data_set[:training_data_number]
        training_data_label = all_data_label[:training_data_number]
        testing_data_set = all_data_set[training_data_number:]
        testing_data_label = all_data_label[training_data_number:]

        return training_data_set, training_data_label, testing_data_set, testing_data_label


if __name__ == "__main__":
    # Entry point: build the vocabulary, then produce the train/test split.
    bayes2_test = Bayer2Test()

    # Build the vocabulary set from the data file.
    vocabulary_set = bayes2_test.read_file_and_create_vocabulary_set(Bayer2Test.File_Path)

    # Create training/testing data sets and their labels.
    (training_data_set, training_data_label,
     testing_data_set, testing_data_label) = bayes2_test.create_training_data_set_and_testing_data_set(
        vocabulary_set, Bayer2Test.File_Path)